{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup Amazon Comprehend Through AWS Console "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<img src=\"img/comprehend.png\" width=\"80%\" align=\"left\">"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Note that Amazon Comprehend is currently only supported in a subset of regions: \n",
    "\n",
    "* US East (N. Virginia), US East (Ohio), US West (Oregon)\n",
    "* Canada (Central)\n",
    "* Europe (London), Europe (Ireland), Europe (Frankfurt)\n",
    "* Asia Pacific (Mumbai), Asia Pacific (Seoul), Asia Pacific (Tokyo), Asia Pacific (Singapore), Asia Pacific (Sydney)\n",
    "\n",
    "You can check https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/ for details and updates. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "import sagemaker\n",
    "import pandas as pd\n",
    "\n",
    "sess   = sagemaker.Session()\n",
    "bucket = sess.default_bucket()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "from botocore.config import Config\n",
    "\n",
    "config = Config(\n",
    "   retries = {\n",
    "      'max_attempts': 10,\n",
    "      'mode': 'adaptive'\n",
    "   }\n",
    ")\n",
    "\n",
    "iam = boto3.client('iam', config=config)\n",
    "sm = boto3.Session().client(service_name='sagemaker', region_name=region)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Check if you current regions supports Comprehend"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if region in ['ap-south-1', 'eu-west-2', 'eu-west-1', 'ap-northeast-2', 'ap-northeast-1', 'ca-central-1', 'ap-southeast-1', 'ap-southeast-2', 'eu-central-1', 'us-east-1', 'us-east-2', 'us-west-2']:\n",
    "    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
    "    print(' SUCCESS: COMPREHEND IS SUPPORTED IN {}'.format(region))\n",
    "    print(' Please proceed with this notebook.' )\n",
    "    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
    "else:\n",
    "    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )\n",
    "    print(' !! COMPREHEND IS *NOT* SUPPORTED IN {}!! '.format(region))\n",
    "    print(' This is OK. Skip this notebook and continue with the next notebook.' )\n",
    "    print(' This notebook is not required for the rest of this workshop.' )\n",
    "    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++' )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "comprehend = boto3.client('comprehend')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Retrieve S3 location of training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store -r noheader_train_s3_uri"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not noheader_train_s3_uri:\n",
    "    print('****************************************************************************************')\n",
    "    print('**************** PLEASE RE-RUN THE PREVIOUS DATA PREPARATION NOTEBOOK ******************')\n",
    "    print('**************** THIS NOTEBOOK WILL NOT RUN PROPERLY ***********************************')\n",
    "    print('****************************************************************************************')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(noheader_train_s3_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 ls $noheader_train_s3_uri"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## See our prepared training data which we use as input for Comprehend"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 cp $noheader_train_s3_uri ./data/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "df = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00_noheader.csv', header=None)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create Data Access Role for Comprehend"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Policy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "assume_role_policy_doc = {\n",
    "  \"Version\": \"2012-10-17\",\n",
    "  \"Statement\": [\n",
    "    {\n",
    "      \"Effect\": \"Allow\",\n",
    "      \"Principal\": {\n",
    "        \"Service\": \"comprehend.amazonaws.com\"\n",
    "      },\n",
    "      \"Action\": \"sts:AssumeRole\"\n",
    "    }\n",
    "  ]\n",
    "} "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create Role and Attach Policies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iam_comprehend_role_name = 'DSOAWS_Comprehend'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import time\n",
    "\n",
    "from botocore.exceptions import ClientError\n",
    "\n",
    "try:\n",
    "    iam_role_comprehend = iam.create_role(\n",
    "        RoleName=iam_comprehend_role_name,\n",
    "        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),\n",
    "        Description='DSOAWS Comprehend Role'\n",
    "    )\n",
    "except ClientError as e:\n",
    "    if e.response['Error']['Code'] == 'EntityAlreadyExists':\n",
    "        iam_role_comprehend = iam.get_role(RoleName=iam_comprehend_role_name)\n",
    "        print(\"Role already exists\")\n",
    "    else:\n",
    "        print(\"Unexpected error: %s\" % e)\n",
    "        \n",
    "time.sleep(30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "comprehend_s3_policy_doc = {\n",
    "    \"Version\": \"2012-10-17\",\n",
    "    \"Statement\": [\n",
    "        {\n",
    "            \"Action\": [\n",
    "                \"s3:GetObject\"\n",
    "            ],\n",
    "            \"Resource\": [\n",
    "                \"arn:aws:s3:::{}/*\".format(bucket)\n",
    "            ],\n",
    "            \"Effect\": \"Allow\"\n",
    "        },\n",
    "        {\n",
    "            \"Action\": [\n",
    "                \"s3:ListBucket\"\n",
    "            ],\n",
    "            \"Resource\": [\n",
    "                \"arn:aws:s3:::{}\".format(bucket)\n",
    "            ],\n",
    "            \"Effect\": \"Allow\"\n",
    "        },\n",
    "        {\n",
    "            \"Action\": [\n",
    "                \"s3:PutObject\"\n",
    "            ],\n",
    "            \"Resource\": [\n",
    "                \"arn:aws:s3:::{}/*\".format(bucket)\n",
    "            ],\n",
    "            \"Effect\": \"Allow\"\n",
    "        }\n",
    "    ]\n",
    "}\n",
    "\n",
    "print(comprehend_s3_policy_doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Attach Policy to Role"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "response = iam.put_role_policy(\n",
    "    RoleName=iam_comprehend_role_name,\n",
    "    PolicyName='DSOAWS_ComprehendPolicyToS3',\n",
    "    PolicyDocument=json.dumps(comprehend_s3_policy_doc)\n",
    ")\n",
    "\n",
    "print(response)\n",
    "\n",
    "time.sleep(30)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train the Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prefix = 'models'\n",
    "\n",
    "s3_output_job = 's3://{}/{}/{}'.format(bucket, prefix, 'comprehend/output')\n",
    "print(s3_output_job)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iam_role_comprehend_arn = iam_role_comprehend['Role']['Arn']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import time\n",
    "\n",
    "timestamp = str(datetime.datetime.now().strftime(\"%s\"))\n",
    "\n",
    "comprehend_training_job_name = 'Amazon-Customer-Reviews-Classifier-{}'.format(timestamp) \n",
    "\n",
    "print(comprehend_training_job_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_job = comprehend.create_document_classifier(\n",
    "    DocumentClassifierName=comprehend_training_job_name,\n",
    "    DataAccessRoleArn=iam_role_comprehend_arn,\n",
    "    InputDataConfig={\n",
    "        'S3Uri': noheader_train_s3_uri\n",
    "    },\n",
    "    OutputDataConfig={\n",
    "        'S3Uri': s3_output_job\n",
    "    },\n",
    "    LanguageCode='en'\n",
    ")\n",
    "\n",
    "time.sleep(30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "comprehend_training_job_arn = training_job['DocumentClassifierArn']\n",
    "\n",
    "print(comprehend_training_job_arn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.display import display, HTML\n",
    "\n",
    "display(HTML('<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/comprehend/v2/home?region={}#classifier-details/{}\">Comprehend Training Job</a></b>'.format(region, comprehend_training_job_arn)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "\n",
    "max_time = time.time() + 3 * 60 * 60 # 3 hours\n",
    "while time.time() < max_time:\n",
    "    describe_custom_classifier = comprehend.describe_document_classifier(\n",
    "        DocumentClassifierArn = comprehend_training_job_arn\n",
    "    )\n",
    "    status = describe_custom_classifier[\"DocumentClassifierProperties\"][\"Status\"]\n",
    "    print(\"Custom classifier: {}\".format(status))\n",
    "    \n",
    "    if status == \"TRAINED\" or status == \"IN_ERROR\":\n",
    "        print('')\n",
    "        print('Status {}'.format(status))\n",
    "        print('')\n",
    "        print(describe_custom_classifier[\"DocumentClassifierProperties\"])\n",
    "        break\n",
    "        \n",
    "    time.sleep(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# _Please Wait Until the ^^ Classifier ^^ is Trained Above._"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Show Results of the Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(describe_custom_classifier[\"DocumentClassifierProperties\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_arn = describe_custom_classifier[\"DocumentClassifierProperties\"][\"DocumentClassifierArn\"]\n",
    "print(model_arn)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "#Retrieve the S3URI from the model output and create jobkey variable.\n",
    "job_output = describe_custom_classifier[\"DocumentClassifierProperties\"][\"OutputDataConfig\"][\"S3Uri\"]\n",
    "print(job_output)\n",
    "\n",
    "path_prefix = 's3://{}/'.format(bucket)\n",
    "\n",
    "job_key = os.path.relpath(job_output, path_prefix)\n",
    "\n",
    "print(job_key)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Download Model Artifacts including Training Metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s3 = boto3.resource('s3')\n",
    "\n",
    "s3.Bucket(bucket).download_file(job_key, './output.tar.gz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Unpack the gzip file\n",
    "!tar xvzf ./output.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('./output/confusion_matrix.json') as json_file:\n",
    "    data = json.load(json_file)\n",
    "print(json.dumps(data, indent=2, default=str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install tabulate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.display import HTML, display\n",
    "import tabulate\n",
    "table = [['', '1', '2', '3', '4', '5', '(Predicted)'],\n",
    "         ['1', data['confusion_matrix'][0][0], data['confusion_matrix'][0][1], data['confusion_matrix'][0][2], data['confusion_matrix'][0][3], data['confusion_matrix'][0][4]],\n",
    "         ['2', data['confusion_matrix'][1][0], data['confusion_matrix'][1][1], data['confusion_matrix'][1][2], data['confusion_matrix'][1][3], data['confusion_matrix'][1][4]],\n",
    "         ['3', data['confusion_matrix'][2][0], data['confusion_matrix'][2][1], data['confusion_matrix'][2][2], data['confusion_matrix'][2][3], data['confusion_matrix'][2][4]],\n",
    "         ['4', data['confusion_matrix'][3][0], data['confusion_matrix'][3][1], data['confusion_matrix'][3][2], data['confusion_matrix'][3][3], data['confusion_matrix'][3][4]],\n",
    "         ['5', data['confusion_matrix'][4][0], data['confusion_matrix'][4][1], data['confusion_matrix'][4][2], data['confusion_matrix'][4][3], data['confusion_matrix'][4][4]],\n",
    "         ['(Actual)']]\n",
    "display(HTML(tabulate.tabulate(table, tablefmt='html')))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Deploy Endpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from time import gmtime, strftime, sleep\n",
    "timestamp_suffix = strftime('%d-%H-%M-%S', gmtime())\n",
    "\n",
    "comprehend_endpoint_name = 'comprehend-inference-ep-' + timestamp_suffix\n",
    "\n",
    "inference_endpoint_response = comprehend.create_endpoint(\n",
    "    EndpointName=comprehend_endpoint_name,\n",
    "    ModelArn=model_arn,\n",
    "    DesiredInferenceUnits=1\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "comprehend_endpoint_arn = inference_endpoint_response[\"EndpointArn\"]\n",
    "print(comprehend_endpoint_arn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pass Variables to the Next Notebook(s)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%store comprehend_endpoint_arn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predict with Endpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "describe_response = comprehend.describe_endpoint(\n",
    "    EndpointArn = comprehend_endpoint_arn\n",
    ")\n",
    "print(describe_response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.display import display, HTML\n",
    "\n",
    "display(HTML('<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/comprehend/v2/home?region={}#classifier-details/{}/endpoints/{}/details\">Comprehend Model Endpoint</a></b>'.format(region, comprehend_training_job_arn, comprehend_endpoint_arn)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "max_time = time.time() + 3*60*60 # 3 hours\n",
    "while time.time() < max_time:\n",
    "    describe_response = comprehend.describe_endpoint(\n",
    "        EndpointArn = comprehend_endpoint_arn\n",
    "    )\n",
    "    status = describe_response[\"EndpointProperties\"][\"Status\"]\n",
    "    print(\"Endpoint: {}\".format(status))\n",
    "    \n",
    "    if status == \"IN_SERVICE\" or status == \"IN_ERROR\":\n",
    "        break\n",
    "        \n",
    "    time.sleep(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "txt = \"\"\"I loved it!  I will recommend this to everyone.\"\"\"\n",
    "\n",
    "response = comprehend.classify_document(\n",
    "    Text= txt,\n",
    "    EndpointArn = comprehend_endpoint_arn\n",
    ")\n",
    "\n",
    "import json\n",
    "print(json.dumps(response, indent=2, default=str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "txt = \"\"\"It's OK.\"\"\"\n",
    "\n",
    "response = comprehend.classify_document(\n",
    "    Text= txt,\n",
    "    EndpointArn = comprehend_endpoint_arn\n",
    ")\n",
    "\n",
    "import json\n",
    "print(json.dumps(response, indent=2, default=str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "txt = \"\"\"Really bad.  I hope they don't make this anymore.\"\"\"\n",
    "\n",
    "response = comprehend.classify_document(\n",
    "    Text= txt,\n",
    "    EndpointArn = comprehend_endpoint_arn\n",
    ")\n",
    "\n",
    "import json\n",
    "print(json.dumps(response, indent=2, default=str))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.display import display, HTML\n",
    "\n",
    "display(HTML('<b>Review <a target=\"blank\" href=\"https://console.aws.amazon.com/comprehend/v2/home?region={}#classifier-details/{}/endpoints/{}/details\">Comprehend Model Endpoint</a></b>'.format(region, comprehend_training_job_arn, comprehend_endpoint_arn)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%javascript\n",
    "Jupyter.notebook.save_checkpoint();\n",
    "Jupyter.notebook.session.delete();"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
